import os
from collections import Counter
from heapq import heappush, heappop
from math import log
import io
import re
from pprint import pprint
# DS
import numpy as np
from scipy import spatial
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
# visualization
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
sns.set_style("whitegrid")
sns.set(rc={'figure.figsize':(10,8)})
import plotly.express as px
px.set_mapbox_access_token(os.getenv('MAPBOX_TOKEN'))
from langdetect import detect, detect_langs
import spacy
from spacy import displacy
from gensim.models import CoherenceModel
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
# Allow multiple outputs for each cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
# suppress auto-conversion to scientific notation
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# display all columns
pd.set_option('display.max_columns', None)
# display 100 rows
pd.set_option('display.max_rows', 20)
random_state = 10
Airbnb Sydney Data: https://www.kaggle.com/tylerx/sydney-airbnb-open-data
# Load the raw Sydney Airbnb listings and keep only rows with a description.
listings = pd.read_csv('data/sydney-airbnb-open-data/listings_dec18.csv')
listings.shape
listings.head()
listings = listings[listings['description'].notnull()]
listings.shape

# Detect the language of each description. langdetect raises on degenerate
# input (e.g. numbers-only text), so record None for those rows.
result = []
for val in listings['description'].values:
    try:
        result.append(detect(val))
    except Exception:  # was a bare `except:`; narrowed so KeyboardInterrupt etc. still propagate
        result.append(None)
listings['lang'] = result
listings['lang'].value_counts(dropna=False)[:10]

# Keep English-only listings. The `== 'en'` comparison is False for None,
# so the old redundant notnull() filter has been removed.
listings = listings[listings['lang'] == 'en']
listings.shape
notable features
# Persist the language-filtered listings for the downstream NLP steps.
# listings.to_csv('data/tmp/listings_sample.csv', index=False)
listings.to_csv('data/tmp/listings.csv', index=False)
en_core_web_lg: English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities.
# Load the large English spaCy pipeline (provides GloVe word vectors)
# and reload the filtered listings produced above.
nlp = spacy.load('en_core_web_lg')
# df = pd.read_csv('data/tmp/listings_sample.csv')
df = pd.read_csv('data/tmp/listings.csv')
def clean(txt):
    """Normalize a raw description for modelling.

    Drops stop words, punctuation and non-alphabetic tokens, substitutes
    spaCy lemmas where they differ from the surface form (skipping the
    '-PRON-' placeholder), and lower-cases everything else.
    """
    kept = []
    for token in nlp(txt):
        if token.is_stop or token.is_punct or not token.is_alpha:
            continue
        if token.text != token.lemma_ and token.lemma_ != '-PRON-':
            kept.append(token.lemma_)
        else:
            kept.append(token.lower_)
    return " ".join(kept)
# Apply the spaCy-based normalization to every listing description.
df['description_cleaned'] = df['description'].apply(clean)
def parse_entity(txt):
    """Return the list of tokens that spaCy tags as part of a named entity."""
    return [tok.text for tok in nlp(txt) if tok.ent_type_]
def remove_entity(txt):
    """Drop every token that belongs to a named entity; re-join the rest."""
    kept = (tok.text for tok in nlp(txt) if not tok.ent_type_)
    return " ".join(kept)
# Split each cleaned description into its named entities and the
# remaining entity-free text.
df['entities'] = df['description_cleaned'].apply(parse_entity)
df['description_cleaned'] = df['description_cleaned'].apply(remove_entity)
# Document frequency of each entity: count an entity at most once per
# listing (hence the set), then rank by how many listings mention it.
c = Counter()
for ents in df['entities'].values:
    c.update(set(ents))
# Counter.most_common() sorts by count descending — same ordering the old
# manual sorted(..., key=count, reverse=True) produced.
ents = pd.DataFrame(c.most_common(), columns=['entity', 'count'])
ents['entity_pct'] = ents['count'] / len(df)
ents.head(20)
ents.to_csv('data/tmp/entity_count.csv', index=False)
# def clean(txt):
# """ remove nouns """
# doc = nlp(txt)
# newdoc = [token.text for token in doc if token.pos_ == 'NOUN' or token.pos_ == 'VERB']
# return " ".join(newdoc)
# df['description_cleaned'] = df['description_cleaned'].apply(clean)
get term frequencies (in terms of number of documents that a term appears in)
# Term document-frequency: each term is counted at most once per document.
terms = []
for doc in df['description_cleaned'].values:
    terms.extend(set(doc.split()))
c = Counter(terms)
terms = pd.DataFrame(sorted(c.items(), key=lambda kv: kv[1], reverse=True),
                     columns=['term', 'count'])
terms['term_frequency'] = terms['count'] / len(df)
check for out of vocab terms
def check_vocab(word):
    """True if the first spaCy token of `word` has a pretrained vector."""
    first_token = next(iter(nlp(word)))
    return first_token.has_vector
# Flag which terms are covered by the spaCy vocabulary (1) vs out-of-vocab (0).
terms['has_vector'] = terms['term'].apply(check_vocab)
terms['has_vector'] = terms['has_vector'].astype(int)
terms['has_vector'].value_counts()
terms['has_vector'].describe()
terms.head(20)
def clean(txt):
    """Keep only the tokens that have a pretrained word vector."""
    in_vocab = [tok.text for tok in nlp(txt) if tok.has_vector]
    return " ".join(in_vocab)
# Apply the vocabulary filter and drop descriptions that became empty.
df['description_cleaned'] = df['description_cleaned'].apply(clean)
df.shape
df = df[df['description_cleaned'].str.strip() != '']
df.shape
df['description_cleaned'].str.split(' ').str.len().describe()
Remove words with low inverse document frequency
$$\text{idf} = \log_{10}\!\left(\frac{N}{\text{df}}\right)$$ where $\text{df}$ = the number of documents a term appears in, and $N$ = the total number of documents.
If a term appears in every document, idf=0. The rarer a term is, the higher idf is.
# Restrict to in-vocabulary terms and recompute frequencies on the
# (smaller) filtered corpus.
terms = terms[terms['has_vector'] == 1]
terms['term_frequency'] = terms['count'] / len(df)
terms.head()
# idf = log10(N / (df + 1)); the +1 smooths terms appearing in every doc.
terms['idf'] = np.log10(len(df) / (terms['count'] + 1))
terms['tfidf'] = terms['idf'] * terms['term_frequency']
terms.describe()
terms.head()
px.histogram(terms, x='idf', title='Inverse Document Frequency')
px.scatter(terms, x='term_frequency', y='idf', hover_name='term')
px.parallel_coordinates(terms[['term_frequency', 'idf', 'tfidf']],
                        color='tfidf', color_continuous_scale=px.colors.sequential.RdBu)
# Terms with idf below this threshold appear in more than p (= 5%) of all
# documents; they carry little discriminative signal and are removed.
N = len(df)
p = 0.05
threshold_idf = np.log10(N / (N * p))  # == log10(1/p); 'treshhold' typo fixed (only used below)
threshold_idf
# threshold idf of terms that appear in 5% of all documents
# words to remove
terms[terms['idf'] < threshold_idf]
terms[terms['idf'] < threshold_idf].to_csv('data/tmp/remove_words.csv', index=False)
terms[terms['idf'] >= threshold_idf].to_csv('data/tmp/term_freq.csv', index=False)
remove = set(terms.loc[terms['idf'] < threshold_idf, 'term'])
len(remove)
remove
def clean(txt):
    """Drop terms that are too common across listings.

    `remove` holds the LOW-idf terms (those appearing in more than 5% of
    documents). The old docstring said "high inverse document frequency",
    which was the opposite of what the code does.
    """
    kept = [token.text for token in nlp(txt) if token.text not in remove]
    return " ".join(kept)
# Remove the frequent terms, drop now-empty rows, and persist the result.
df['description_cleaned'] = df['description_cleaned'].apply(clean)
df.shape
df = df[df['description_cleaned'].notnull()]
df = df[df['description_cleaned'].str.strip() != '']
# NOTE(review): reset_index() without drop=True keeps the old index as an
# 'index' column, which is then written to the CSV — confirm intended.
df.reset_index(inplace=True)
df.shape
df.head()
df.to_csv('data/tmp/listings_nlp.csv', index=False)
# df.to_csv('data/tmp/listings_sample_nlp.csv', index=False)
# Reload the NLP-processed listings and sanity-check a few features.
df = pd.read_csv('data/tmp/listings_nlp.csv')
df.shape
df.head()
# raw string r'\w+' — '\w' in a plain string is an invalid escape sequence
# (DeprecationWarning, an error in future Python versions).
df['description'].apply(lambda x: re.findall(r'\w+', x)).str.len().describe()
df['description_cleaned'].str.split(' ').str.len().describe()
# Convert the 't'/'f' superhost flag to 0/1.
df['host_is_superhost'] = df['host_is_superhost'] == 't'
df['host_is_superhost'] = df['host_is_superhost'].astype(int)
df['host_is_superhost'].value_counts()
df['neighbourhood'].value_counts()
df['property_type'].value_counts()
cols = ['host_listings_count', 'number_of_reviews', 'review_scores_rating']
df[cols] = df[cols].astype(float)
df[cols].describe()
- host_listings_count: mostly private hosts, with an outlier having 276 listings
- number_of_reviews: same as above
- review_scores_rating: majority above 90, with an outlier minimum of 20

df['description_cleaned'].str.split().explode().value_counts()
# The 'entities' column round-tripped through CSV as a string; recover the
# token lists with a raw-string regex (r'\w+' avoids the invalid-escape warning).
df['entities'] = df['entities'].apply(lambda x: re.findall(r'\w+', x))
df['entities'].explode().value_counts()
# Lazily (re)load the spaCy pipeline; `is None` is the idiomatic None test
# (the old `nlp == None` used equality instead of identity).
if nlp is None:
    nlp = spacy.load('en_core_web_lg')
# Visualize the named entities of one raw description.
txt = df['description'].values[0]
doc = nlp(txt)
displacy.render(doc, style='ent')
df = pd.read_csv('data/tmp/listings_nlp.csv')
df.shape
df.head()
!python -m spacy init-model en data/nlp/wiki-news-300d-1M --vectors-loc data/nlp/wiki-news-300d-1M.vec.zip
# Load the fastText vectors converted into a spaCy model by the
# `spacy init-model` command above.
fasttext = spacy.load("data/nlp/wiki-news-300d-1M")

def word_repr(txt):
    """Average fastText embedding of all tokens in `txt`."""
    return fasttext(txt).vector

# One 300-d embedding per listing, stacked into an (n_listings, 300) matrix.
vectors = df['description_cleaned'].apply(word_repr)
vectors = np.stack(list(vectors))
vectors.shape

# Candidate cluster counts for the k-means model selection below.
k_candidates = [5, 10, 15, 20, 50, 75, 100, 150, 300]
Within-Cluster Sum of Squared Errors (WSS), available after fitting as `kmeans.inertia_`
def simulate(k, vectors):
    """Fit k-means with `k` clusters on `vectors` and return the
    within-cluster sum of squared errors (inertia)."""
    # `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0;
    # passing it raises TypeError on current versions, so it is dropped.
    kmeans = KMeans(n_clusters=k, random_state=random_state)
    kmeans.fit(vectors)
    return kmeans.inertia_
# Elbow plot: WSS (inertia) as a function of the number of clusters.
result = []
for k in k_candidates:
    wss = simulate(k, vectors)
    result.append([k, wss])
    print(k, wss)
result = np.array(result)
plt.plot(result[:, 0], result[:, 1])
plt.xlabel('k')
plt.ylabel('WSS')
plt.title('Within-Cluster-Sum of Squared Errors (WSS / Inertia)')
Silhouette Score
def simulate(k, vectors, metric='cosine'):
    """Fit k-means with `k` clusters and return the silhouette score of
    the resulting labelling.

    metric: cosine, euclidean, l1, l2, manhattan
    """
    # `n_jobs` was removed in scikit-learn 1.0, so it is no longer passed.
    kmeans = KMeans(n_clusters=k, random_state=random_state)
    return silhouette_score(vectors, kmeans.fit_predict(vectors), metric=metric)
# Silhouette score as a function of the number of clusters.
result = []
for k in k_candidates:
    score = simulate(k, vectors)
    result.append([k, score])
    print(k, score)
result = np.array(result)
plt.plot(result[:, 0], result[:, 1])
plt.xlabel('k')
plt.ylabel('silhoutte score')
plt.title('silhoutte score')
%matplotlib inline
# Per-cluster silhouette plots for each candidate k (adapted from the
# scikit-learn silhouette-analysis example). For every k, it fits k-means,
# prints the average silhouette score, and draws one filled band per
# cluster whose width shows the per-sample silhouette coefficients.
for n_clusters in k_candidates:
    # Create a subplot with 1 row and 2 columns
    fig, ax1 = plt.subplots(figsize=(12, 14))
    fig.set_size_inches(18, 7)
    # The silhouette coefficient can range from -1, 1
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(vectors) + (n_clusters + 1) * 10])
    # Initialize the clusterer with n_clusters value and a random generator
    clusterer = KMeans(n_clusters=n_clusters, random_state=random_state)
    cluster_labels = clusterer.fit_predict(vectors)
    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(vectors, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(vectors, cluster_labels)
    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)
        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples
    ax1.set_title(f"Silhouette plot for {n_clusters} clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    plt.show()
# Final model: fit k-means with the chosen k and attach 1-based cluster
# labels (as strings, for categorical coloring in plotly) to the listings.
OPTIMUM_K = 5
remove = pd.read_csv('data/tmp/remove_words.csv')
remove = set(remove['term'])
# `n_jobs` was removed in scikit-learn 1.0, so it is no longer passed.
kmeans = KMeans(n_clusters=OPTIMUM_K, random_state=random_state)
y = kmeans.fit_predict(vectors)
df['cluster'] = y
# start cluster from 1
df['cluster'] = df['cluster'].astype(int) + 1
df['cluster'] = df['cluster'].astype(str)
silhouette_score(vectors, y)
kmeans.inertia_
def load_vectors(fname):
    """Load a fastText .vec file into a {word: np.ndarray} dict.

    The first line of the file is a "<n_words> <dim>" header; it is read
    (and thereby skipped) but not otherwise used.
    """
    data = {}
    # `with` guarantees the handle is closed — the original leaked it.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.array(list(map(float, tokens[1:])))
    return data
fasttext_vec = load_vectors(fname='data/nlp/wiki-news-300d-1M.vec')

# For every cluster centroid, find the 10 vocabulary words (excluding the
# removed high-frequency terms) nearest in cosine distance. Pushing -dist
# onto the heap and popping when len > 10 keeps the 10 smallest distances.
clusters = {}
for k, centroid in enumerate(kmeans.cluster_centers_, 1):
    h = []
    print(k)
    for idx, (word, rep) in enumerate(fasttext_vec.items()):
        if idx % 100000 == 0 and idx != 0:
            print(idx, h)  # progress + current candidate set
        if word not in remove:
            # scipy's cosine() expects 1-D inputs; the old
            # rep.reshape((-1, 1)) made the vector 2-D, which current
            # scipy versions reject. `rep` is already 1-D.
            dist = spatial.distance.cosine(centroid, rep)
            heappush(h, (-dist, word))
            if len(h) > 10:
                _ = heappop(h)
    clusters[k] = [item[1] for item in h]
    print(", ".join(clusters[k]))
# Show each cluster's nearest words, then only the words unique to it.
for cluster_id, words in clusters.items():
    print(f'k={cluster_id}: {sorted(words)}')
# Only words unique to each cluster
keywords = Counter(np.hstack(list(clusters.values())))
for cluster_id, words in clusters.items():
    print(f'k={cluster_id}:', [w for w in words if keywords[w] == 1])
# Map view of the clusters, per-cluster top-10 word frequencies, then
# persist the embeddings and the clustered listings.
df['cluster'] = df['cluster'].astype(str)
px.scatter_mapbox(df, lat='latitude', lon='longitude', color='cluster')
# top 10 word frequency in each cluster
for k in sorted(df['cluster'].unique()):
    print(k)
    in_cluster = df['cluster'] == k
    ct = in_cluster.sum()
    df.loc[in_cluster, 'description_cleaned'].str.split(' ').explode().value_counts()[:10] / ct
np.savetxt('data/tmp/word_embeddings.csv', vectors)
df.to_csv('data/tmp/listings_clusters.csv', index=False)
# Reload embeddings and clustered listings, then build the gensim
# dictionary and bag-of-words corpus for LDA topic modelling.
vectors = np.loadtxt('data/tmp/word_embeddings.csv')
df = pd.read_csv('data/tmp/listings_clusters.csv')
id2word = corpora.Dictionary(df['description_cleaned'].str.split(' ').values)
corpus = [id2word.doc2bow(text) for text in df['description_cleaned'].str.split(' ').values]
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive)
    start, step : range of topic counts to try

    Returns:
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Bug fix: the original passed the module-level global `id2word`
        # here, silently ignoring the `dictionary` parameter.
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=random_state)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
# Sweep num_topics from 8 up to 40 in steps of 4 and score each model.
start = 8
limit = 41
step = 4
model_list, coherence_values = compute_coherence_values(dictionary=id2word,
                                                        corpus=corpus,
                                                        texts=df['description_cleaned'].str.split(' ').values,
                                                        start=start,
                                                        limit=limit,
                                                        step=step)
# Print and plot coherence vs number of topics.
print('Coherence scores')
# zip replaces the manual iter/next bookkeeping of the original.
for n_topics, score in zip(range(start, limit, step), coherence_values):
    print(n_topics, ':', score)
plt.plot(range(start, limit, step), coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# Bug fix: ("coherence_values") is just a parenthesized string, and
# matplotlib iterates it character by character as labels; a one-element
# list yields a single legend entry.
plt.legend(["coherence_values"], loc='best')
plt.show()
# Select the model with the highest c_v coherence from the sweep.
idx = coherence_values.index(max(coherence_values))
optimal_model = model_list[idx]
print('Optimal n topics:', optimal_model.num_topics)
print('Maximum coherence value:', max(coherence_values))
# Compute Perplexity - a measure of how good the model is. lower the better.
print('Perplexity: ', optimal_model.log_perplexity(corpus))
# print the topics
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))
# visualize modelled topics and export graph
vis = pyLDAvis.gensim.prepare(optimal_model, corpus, id2word)
pyLDAvis.save_html(vis, 'data/topics.html')
vis
Get dominant topic for each listing description
def format_topics_sentences(ldamodel, corpus, texts):
    """Per document: dominant topic, its contribution, topic keywords, text.

    Parameters:
    ldamodel : fitted gensim LdaModel (indexable by corpus)
    corpus : gensim bag-of-words corpus
    texts : original documents aligned with `corpus`

    Returns a DataFrame with columns
    ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'] plus an
    unnamed trailing column holding the original text.
    """
    # Collect one record per document, then build the frame once.
    # DataFrame.append (used per-row by the original) was deprecated in
    # pandas 1.4 and removed in 2.0, and was O(n^2) anyway.
    records = []
    for doc_topics in ldamodel[corpus]:
        # Highest-contribution topic first.
        doc_topics = sorted(doc_topics, key=lambda tc: tc[1], reverse=True)
        topic_num, prop_topic = doc_topics[0]  # => dominant topic
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join(word for word, prop in wp)
        records.append([int(topic_num), round(prop_topic, 4), topic_keywords])
    sent_topics_df = pd.DataFrame(records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])
    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df
# Attach the dominant topic to each listing (topics renumbered from 1,
# stored as strings for categorical coloring in plotly).
df_dominant_topic = format_topics_sentences(ldamodel=optimal_model,
                                            corpus=corpus,
                                            texts=df['description'].values)
df_dominant_topic.reset_index(inplace=True)
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# start topic from 1
df_dominant_topic['Dominant_Topic'] = df_dominant_topic['Dominant_Topic'].astype(int) + 1
df_dominant_topic.head()
df['dominant_topic'] = df_dominant_topic['Dominant_Topic']
df['dominant_topic'] = df['dominant_topic'].astype(str)
# For every topic, keep the single listing whose description has the
# highest contribution to that topic (most representative document).
sent_topics_sorteddf = pd.DataFrame()
sent_topics_outdf_grpd = df_dominant_topic.groupby('Dominant_Topic')
for topic_id, topic_rows in sent_topics_outdf_grpd:
    best_row = topic_rows.sort_values(['Topic_Perc_Contrib'], ascending=[0]).head(1)
    sent_topics_sorteddf = pd.concat([sent_topics_sorteddf, best_row], axis=0)
Most representative document by topic
- Topic_Perc_Contrib: % of the listing description that falls under the given topic
- Keywords: keywords of the topic
- Text: the listing description that best represents the given topic

sent_topics_sorteddf.drop('Document_No', 1, inplace=True)
# Tidy the representative-document table, plot selected topics on the
# map, and persist the final listings with topic assignments.
sent_topics_sorteddf.reset_index(drop=True, inplace=True)
sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
sent_topics_sorteddf
# does not seem to translate to any geographical pattern / trend
px.scatter_mapbox(df[df['dominant_topic'].isin(['1', '2', '12'])], lat='latitude', lon='longitude', color='dominant_topic')
df.to_csv('data/tmp/listings_clusters.csv', index=False)